One of the most useful applications of RNA-seq is the identification of genes that are differentially expressed between two or more biological conditions of interest. This can be achieved by analysing whether changes in read count data between groups is greater than what would be expected just due to natural random variation.
In general, the differential gene expression (DGE) analysis can be performed in seven steps:
#rm(list=ls(all=TRUE))
library(tidyverse)
library(devtools)
library(statmod)
library(ggplot2)
library(magrittr)# this will allow us to string commands (UNIX pipe) like fashion using % >%
library(edgeR)
library(GOstats)
library(RSQLite)
library(Biobase)
library(gplots)
library(ggplot2)
library(WGCNA)
library(GSEABase)
library(flashClust)
library(preprocessCore)
library(RColorBrewer)
library(data.table)
library(ggplot2)
library(plotly)
library(limma)
library(dplyr)
library(kableExtra)
library(gage)
library(rGTExNetwork)
allowWGCNAThreads()
## Allowing multi-threading with up to 4 threads.
# Read count and data handlyng
#Make the first column as as rownames
gene_counts <- read.csv("genes_counts.csv", row.names=1)
counts<-gene_counts
#Take a look at the column names
gene_descriptions <- read.delim("gene_descriptions.txt")
#Naming the groups
Factors <- read_tsv("factors.tsv", col_types = cols()) %>% filter(sex == "female")
## Warning: package 'bindrcpp' was built under R version 3.4.4
head(Factors)
## # A tibble: 6 x 4
## sample age sex caste
## <chr> <int> <chr> <chr>
## 1 SRR3102934 2 female worker
## 2 SRR3123272 2 female worker
## 3 SRR3123273 2 female worker
## 4 SRR3123275 2 female worker
## 5 SRR3123276 2 female worker
## 6 SRR3123277 2 female worker
##Removing NAs columns
Factors <- Factors[!is.na(names(Factors))]
# create factors for future plotting
sample <- as.character(Factors$sample)
age <-factor(Factors$age)
caste <-factor(Factors$caste)
In early steps of data analysis quality assessment (QA) and data exploration are essential and they should precede the normalization step and the DGE testing. Because count values are usually highly skewed, counts are usually log2 transformed.
#Raw counts
matrix_counts = melt(as.matrix(gene_counts))
colnames(matrix_counts) = c('gene_id', 'sample', 'value')
ggplot(matrix_counts, aes(x=value, color=sample)) + geom_histogram(fill = "#525252", binwidth = 2000)
#Log2 transformation
log_counts <- log2(gene_counts + 1)
matrix_counts = melt(as.matrix(log_counts))
colnames(matrix_counts) = c('gene_id', 'sample', 'value')
ggplot(matrix_counts, aes(x=value, color=sample)) + geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6)
Boxplots can also be used to visualise the distribution of counts across the samples
#Naming groups
Factor <- read_tsv("factors.tsv", col_types = cols())
age1 <-factor(Factor$age)
caste1 <-factor(Factor$caste)
# Density plot of raw read counts (log10)
log_counts <- cpm(counts,log=TRUE)
# Check distributions of samples using boxplots and add a blue horizontal line that corresponds to the median logCPM
boxplot(log_counts, xlab="", ylab="Log2 counts per million",las=2)
abline(h=median(log_counts),col="blue")
##Now the density plot (log2)
log_counts <- log2(counts + 1)
matrix_counts = melt(as.matrix(log_counts))
colnames(matrix_counts) = c('gene_id', 'sample', 'value')
ggplot(matrix_counts, aes(x=value, color=sample)) + geom_density()
##By groups
log_counts<-as.matrix(log_counts)
par(mfrow=c(1,2),oma=c(2,0,0,0))
group.col <- c("black","blue")[Factor$caste]
boxplot(log_counts, xlab="", ylab="Log2 counts per million",las=2,col=group.col,
pars=list(cex.lab=0.8,cex.axis=0.8))
abline(h=median(log_counts),col="blue")
title("Boxplots of logCPMs\n(coloured by groups)",cex.main=0.8)
caste.col <- c("red","black", "blue")[Factor$caste]
boxplot(log_counts, xlab="", ylab="Log2 counts per million",las=2, col=caste.col,
pars=list(cex.lab=0.8,cex.axis=0.8))
abline(h=median(log_counts),col="blue")
title("Boxplots of logCPMs by caste",cex.main=0.8)
Another way to explore dissimilarities between samples is using clustering image map (CIM) or heatmaps.
#To reduce volume of data for this tutorial, we will select only the 50 most highly expressed genes
count_matrix <-as.matrix(log_counts)
selected = order(rowMeans(count_matrix), decreasing=TRUE)[1:50]
highexprgenes_counts <- count_matrix[selected,]
#Heatmap
colnames(highexprgenes_counts)<- caste1:age1
heatmap(highexprgenes_counts, col=topo.colors(60), margin=c(10,6))
Hierarchical clustering
# Get variance for genes
varian_genes <- apply(log_counts, 1, var)
#Select the top 50 most variable
selected_varian <- names(sort(varian_genes, decreasing=TRUE))[1:50]
hig_var_lcpm <- log_counts[selected_varian,]
colnames(hig_var_lcpm)<-caste1:age1
dim(hig_var_lcpm)
## [1] 50 36
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)
colnames(hig_var_lcpm)
## [1] "worker:2" "worker:2" "worker:2" "worker:2" "worker:2" "worker:2"
## [7] "worker:4" "worker:4" "worker:4" "worker:4" "worker:4" "worker:4"
## [13] "queen:2" "queen:2" "queen:2" "queen:2" "queen:2" "queen:2"
## [19] "queen:4" "queen:4" "queen:4" "queen:4" "queen:4" "queen:4"
## [25] "drone:2" "drone:2" "drone:2" "drone:2" "drone:2" "drone:2"
## [31] "drone:4" "drone:4" "drone:4" "drone:4" "drone:4" "drone:4"
hig_var_lcpm<-as.matrix(hig_var_lcpm)
# Plot the heatmap
heatmap.2(hig_var_lcpm,col=rev(morecols(50)),trace="none", main="Top 50 most variable genes", ColSideColors=caste.col,scale="row",margins=c(10,5))
A Principal Component Analysis (PCA) can be used to perform a multidimensional scaling of a data and to visualise data clustering.
##Different from heatmap a higher number of genes can be used in this analysis, we will select 1000 most highly expressed genes
select = order(rowMeans(count_matrix), decreasing=TRUE)[1:1000]
highexprgenes_counts <- count_matrix[select,]
# annotate the data with condition group as labels
colnames(highexprgenes_counts)<- caste1:age1
# transpose the data to have variables (genes) as columns
data_PCA <- t(highexprgenes_counts)
dim(data_PCA)
## [1] 36 1000
##Calculate a dissmililatity matrix of the highly expressed gene counts and the proportion of explained variance (Eigen values)
mds <- cmdscale(dist(data_PCA), k=3, eig=TRUE)
# k = the maximum dimension of the space on which the data will be represented
# eig = indicates whether eigenvalues should be returned
mds$eig
## [1] 7.204748e+03 1.692182e+03 1.199125e+03 8.417372e+02 4.091748e+02
## [6] 3.877096e+02 2.358644e+02 2.155563e+02 1.160132e+02 9.766712e+01
## [11] 7.833602e+01 7.053774e+01 5.170816e+01 4.792215e+01 4.130066e+01
## [16] 3.632614e+01 2.861421e+01 2.502223e+01 2.391109e+01 2.236536e+01
## [21] 1.683165e+01 1.498326e+01 1.380908e+01 1.214275e+01 1.090950e+01
## [26] 1.040529e+01 9.709763e+00 8.791781e+00 7.561276e+00 6.884642e+00
## [31] 5.434245e+00 5.205314e+00 4.155790e+00 3.953332e+00 3.315514e+00
## [36] 2.722168e-13
barplot(mds$eig, las=1, xlab="Dimensions", ylab="Proportion of explained variance (%)", y.axis=NULL, col="darkgrey")
##To visualise how many components can explain the variance on the data set, it is easier to plot the data as percentage
# transform the Eigen values into percentage
eig_per <- mds$eig * 100 / sum(mds$eig)
# plot the PCA
barplot(eig_per, las=1, xlab="Dimensions", ylab="Proportion of explained variance (%)", y.axis=NULL, col="darkgrey")
#The first component explains most of the data, but we will use the two components for plotting
## MDS
mds <- cmdscale(dist(data_PCA)) # Performs MDS analysis
#Samples representation
plot(mds[,1], -mds[,2], type="n", xlab="Dimension 1", ylab="Dimension 2", main="")
text(mds[,1], -mds[,2], rownames(mds), cex=0.8)
Next step is the creation of a DGEList object. It is used by edgeR to store parameter of the gene count data. We will add a group name and factors
# annotate the data with condition group as labels
counts<-gene_counts
colnames(counts)<- caste
##Removing "drone" columns
counts <- counts[!is.na(names(counts))]
##Creating a DGE list object
ge_count <- DGEList( gene_counts[, Factors$sample] , group = Factors$caste)
ge_count[,1:5]
## An object of class "DGEList"
## $counts
## SRR3102934 SRR3123272 SRR3123273 SRR3123275 SRR3123276
## LOC552277 3170.27 2056.58 3517.90 4317.39 3025.27
## LOC102655366 17.00 11.00 17.00 14.00 12.00
## LOC102655367 0.00 0.00 0.00 0.00 1.00
## LOC409639 122.00 177.00 122.00 145.00 136.00
## LOC409638 9067.16 4509.87 4409.51 5037.75 7869.44
## 13898 more rows ...
##
## $samples
## group lib.size norm.factors
## SRR3102934 worker 11551080 1
## SRR3123272 worker 12150811 1
## SRR3123273 worker 10867737 1
## SRR3123275 worker 11813538 1
## SRR3123276 worker 11049189 1
names(ge_count)
## [1] "counts" "samples"
One possible bias is the sequencing depth (number of sequenced or mapped reads) of the samples. For example, if sample 1 generates twice as much reads as sample 2, it is likely that individual genes will also be duplicated, inflating DGE results. Contaminants might also cause a similar issue. The type of normalization depending on the data format, for instance, is a specific set of highly-expressed genes accounts for most of the total counts, a global scaling technic will not capture and efficiently correct for differences between high-count genes. To reduce the effect of high-count genes there are several approaches:
# Create a design
design <- model.matrix(~ 0 +caste + age)
head(design)
## castequeen casteworker age4
## 1 0 1 0
## 2 0 1 0
## 3 0 1 0
## 4 0 1 0
## 5 0 1 0
## 6 0 1 0
##First explore data overdispersion using BCV plot
BCV_plot <- estimateDisp(ge_count, design, robust=TRUE)
plotBCV(BCV_plot)
## Plots using a DGEList object
none_norm=voom(ge_count,design,plot=T, normalize="none")
quantile_norm=voom(ge_count,design,plot=T, normalize="quantile")
scale_norm=voom(ge_count,design,plot=T, normalize="scale")
TMM_norm=calcNormFactors(ge_count,method =c("TMM"))
TMM_plot=voom(TMM_norm,design,plot=T)
upquart_norm=calcNormFactors(ge_count,method =c("upperquartile"))
TMM_plot=voom(upquart_norm,design,plot=T)
Before performing the gene expression analysis we need to define a design matrix. Normalise read counts and apply a linear model to the normalised data
# Substitute "caste" from the design column names
colnames(design)<- gsub("caste","",colnames(design))
log_counts<-as.matrix(log_counts)
## Calculate the normalization factors
norm_fact <- calcNormFactors(ge_count)
plotMDS( norm_fact)
#Normalise read counts
voom_norm <- voom(norm_fact,design,plot=TRUE)
par(mfrow=c(1,2))
boxplot(log_counts)
abline(h=median(log_counts),col=4)
boxplot(voom_norm$E)
abline(h=median(voom_norm$E),col=4)
fit <- lmFit(voom_norm,design)
fit <- eBayes(fit)
results <- decideTests(fit)
summary(results)
## queen worker age4
## Down 4278 4193 3480
## NotSig 676 681 5911
## Up 8949 9029 4512
plotMD(fit,column=1,status=results[,2], main=colnames(fit)[1],xlim=c(-8,13))
vennDiagram(results[,1:3],circle.col=c("turquoise", "salmon", "green"))
write.fit(fit, results, file ="results.txt")
The annotation of gene IDs from RNA-seq data can be done using data from databases (BioMart is recommended). We will use our own library.
#Check if the "gene_description" file matches exactly to our fit row name
head(gene_descriptions)
## gene_id description
## 1 18-w 18-wheeler
## 2 5-HT1 serotonin receptor
## 3 5-HT2alpha serotonin receptor
## 4 5-HT2beta serotonin receptor
## 5 5-ht7 serotonin receptor 7
## 6 A4 apolipophorin-III-like protein
##Make sure the rows matches
table(gene_descriptions$gene_id==rownames(fit))
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## Warning in `==.default`(gene_descriptions$gene_id, rownames(fit)): longer
## object length is not a multiple of shorter object length
##
## FALSE TRUE
## 13902 1
##Make them match
gene_descriptions<- gene_descriptions[match(rownames(fit), gene_descriptions$gene_id),]
# Slot the annotation information into the genes slot of fit.
fit$genes <- gene_descriptions
head(fit$genes)
## gene_id
## 10542 LOC552277
## 2957 LOC102655366
## 2958 LOC102655367
## 6896 LOC409639
## 6895 LOC409638
## 6891 LOC409633
## description
## 10542 class E basic helix-loop-helix protein 22
## 2957 DEAD-box ATP-dependent RNA helicase 42-like
## 2958 odorant receptor 4-like
## 6896 UPF0183 protein CG7083
## 6895 elongation of very long chain fatty acids protein AAEL008004-like
## 6891 ras-related and estrogen-regulated growth inhibitor
#Check if the annotation was included in the output
queenvsworker<-topTable(fit,coef=3,sort.by="p") #allocated in "description"
head(queenvsworker)
## gene_id description logFC
## LOC406152 LOC406152 juvenile hormone epoxide hydrolase 1 3.029676
## Hex110 Hex110 hexamerin 110 6.116161
## LOC724838 LOC724838 uncharacterized LOC724838 6.670708
## LOC102656130 LOC102656130 uncharacterized LOC102656130 3.392700
## LOC413627 LOC413627 uncharacterized LOC413627 2.346632
## Obp14 Obp14 odorant binding protein 14 3.862923
## AveExpr t P.Value adj.P.Val B
## LOC406152 7.703536 34.21787 2.774400e-21 2.429328e-17 38.63741
## Hex110 12.741176 33.87091 3.494682e-21 2.429328e-17 38.44119
## LOC724838 8.091747 29.49715 7.927267e-20 3.560324e-16 35.38678
## LOC102656130 5.371113 29.16304 1.024333e-19 3.560324e-16 35.02988
## LOC413627 7.541384 28.36506 1.911091e-19 5.313979e-16 34.58230
## Obp14 10.750023 27.77388 3.066352e-19 7.105250e-16 34.12917
##Check the expression of vitellogenin
ps <- grep("vitellogenin",fit$genes$description)
topTable(fit[ps,],coef=3)
## gene_id description logFC AveExpr t
## Vg Vg vitellogenin 2.6792254 3.030854 8.741570
## LOC725920 LOC725920 vitellogenin receptor 1.0719112 1.684327 6.709615
## LOC726783 LOC726783 vitellogenin-like 0.3489946 -4.462690 1.451985
## P.Value adj.P.Val B
## Vg 8.810936e-09 2.643281e-08 10.061663
## LOC725920 7.487608e-07 1.123141e-06 5.881228
## LOC726783 1.599653e-01 1.599653e-01 -5.200903
##Check the expression of insulin-like peptide 2
ps <- grep("insulin-like peptide",fit$genes$description)
topTable(fit[ps,],coef=3) # positive logFC means upregulated in the queen
## gene_id description logFC AveExpr
## ILP-2 ILP-2 insulin-like peptide 2 1.7395899 2.581413
## LOC411297 LOC411297 insulin-like peptide receptor -0.7756742 5.223722
## t P.Value adj.P.Val B
## ILP-2 12.713302 6.588417e-12 1.317683e-11 17.391559
## LOC411297 -3.721017 1.116803e-03 1.116803e-03 -1.953726
par(mfrow=c(1,2))
plotMD(fit,coef=3,status=results[,"worker"])
volcanoplot(fit,coef=3,highlight=100,names=fit$genes$gene_id)
# View heatmap of top 100 DE genes between Basal and LP cells.
library("gplots")
stripchart(voom_norm$E["ILP-2",]~Factors$caste)
stripchart(voom_norm$E["ILP-2",]~Factors$age)
## Testing differential expression relative to a threshold
fit.treat <- treat(fit,lfc=1)
res.treat <- decideTests(fit.treat)
summary(res.treat)
## queen worker age4
## Down 3620 3526 324
## NotSig 1905 1922 13123
## Up 8378 8455 456
topTreat(fit.treat,coef=3)
## gene_id description logFC
## Hex110 Hex110 hexamerin 110 6.116161
## LOC724838 LOC724838 uncharacterized LOC724838 6.670708
## Hex70c Hex70c hexamerin 70c 8.381430
## LOC406152 LOC406152 juvenile hormone epoxide hydrolase 1 3.029676
## Obp13 Obp13 odorant binding protein 13 6.071151
## LOC410734 LOC410734 glucose dehydrogenase [FAD, quinone] 6.198759
## LOC406147 LOC406147 short-chain dehydrogenase/reductase 8.125742
## Obp14 Obp14 odorant binding protein 14 3.862923
## LOC102656130 LOC102656130 uncharacterized LOC102656130 3.392700
## LOC725489 LOC725489 farnesol dehydrogenase-like 8.049612
## AveExpr t P.Value adj.P.Val
## Hex110 12.741176 28.33297 9.807076e-20 1.363478e-15
## LOC724838 8.091747 25.07526 1.510075e-18 1.049729e-14
## Hex70c 9.436359 24.16301 3.458471e-18 1.602771e-14
## LOC406152 7.703536 22.92364 1.107555e-17 3.517761e-14
## Obp13 4.413167 22.78695 1.265109e-17 3.517761e-14
## LOC410734 6.355519 22.04815 2.621822e-17 6.075198e-14
## LOC406147 7.882328 21.59015 4.179537e-17 8.301158e-14
## Obp14 10.750023 20.58402 1.188547e-16 1.869206e-13
## LOC102656130 5.371113 20.56722 1.210016e-16 1.869206e-13
## LOC725489 2.747490 20.25998 1.689535e-16 2.348961e-13
plotMD(fit.treat,coef=3,status=res.treat[,"worker"])
abline(h=0,col="grey")
Co-expression network analysis is an approach that builds gene networks that have a tendency to appear co-activated across a group of experimental samples. Although gene co-expression networks can be used for various purposes (i.e.identification of regularoty networks, candidate disease gene prioritization, functional gene annotation), co-expression netwroks are only effective on the identification of correlations and thus, there its use on differential (co-expression) analysis is increaseing. A modification of this analysis is the Weighted gene co-expression network (WGCNA). In a weighted network, all genes are assumed to be connected, and such connections have continuous weight values that indicats the strength of co-regulation between the genes, ranging from 0 to 1. The WGCNA pipeline is as follows:
tpm<- read.csv("tpm.csv")
# Manipulate file so it matches the format WGCNA needs
row.names(tpm) = tpm$gene_id
tpm$gene_id = NULL
tpm = as.data.frame(t(tpm)) # # transpose your data (samples as rows and genes as columns)
dim(tpm)
## [1] 36 13903
# Check the presence of outliers
gsg = goodSamplesGenes(tpm, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Excluding 819 genes from the calculation due to too many missing samples or zero variance.
## ..step 2
gsg$allOK
## [1] FALSE
#Since the statement did not returned as TRUE, the genes didn't passed the test, remove outliers
if (!gsg$allOK)
{if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(tpm)[!gsg$goodGenes], collapse= ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(tpm)[!gsg$goodSamples], collapse=", ")))
tpm= tpm[gsg$goodSamples, gsg$goodGenes]
}
## Removing genes: Mir6044, Mir3788, Mir3781, Mir3780, Mir3783, Mir3782, Mir3784, Mir3787, Mir3786, LOC552160, LOC102656804, LOC102653813, LOC102653600, Mir6005, Mir6004, Mir6006, LOC102656391, LOC102656026, Mir252a, ERCC-00033, ERCC-00031, ERCC-00035, ERCC-00034, ERCC-00039, Mir3789, Mir927a, LOC107965001, LOC107965006, LOC107965008, Mir9888, Mir9889, LOC107966063, Mir282, Mir283, LOC727111, LOC724140, LOC107964834, LOC107965785, LOC107965780, LOC107965782, LOC107964781, LOC107965871, LOC107965875, LOC107965879, LOC107965878, Mir6067, Mir3785, LOC107965275, LOC107965274, LOC107965276, LOC107964002, LOC107965661, Mir971, LOC102656624, ERCC-00054, ERCC-00057, LOC102655424, ERCC-00051, ERCC-00053, LOC102655423, ERCC-00059, ERCC-00058, LOC102654705, LOC102654706, LOC107964255, LOC102654267, Mir9894, LOC100577961, LOC107964810, Mir1-1, LOC107965857, LOC100576244, LOC107965210, LOC107964060, LOC107964067, ERCC-00111, LOC102653859, Mir6066, LOC724754, Mir3737, Mir6065, Mir3732, LOC102655984, Mir6003, ERCC-00079, ERCC-00078, ERCC-00077, ERCC-00076, ERCC-00075, ERCC-00074, ERCC-00073, ERCC-00071, LOC102655527, LOC100577535, Mir92b-1, Mir3745, LOC102654589, Mir9c, Mir3740, LOC102654326, LOC107964871, LOC107965862, LOC107965867, Mir932, Mir3748, LOC102656168, LOC102656161, LOC107965182, LOC107965187, LOC102654132, LOC107965234, LOC107965984, LOC102654687, LOC102656379, LOC102655772, LOC107965954, LOC107965955, LOC107965957, LOC107965953, ERCC-00092, ERCC-00095, ERCC-00097, ERCC-00096, ERCC-00099, ERCC-00098, LOC107964856, LOC107964855, LOC727301, LOC107965899, LOC107965898, LOC107965895, LOC107965894, LOC107965896, LOC107965890, LOC107965892, LOC413723, LOC102655829, ERCC-00131, ERCC-00136, ERCC-00137, ERCC-00134, Mirlet7, LOC102654153, Mir3769, LOC727291, LOC100578074, LOC102655441, LOC107964620, LOC107965979, LOC102655413, LOC107965974, LOC107965975, Mir3798, Mir3799, LOC102654813, LOC727243, LOC102653906, Mir9884, Mir9885, Mir9886, LOC107965371, LOC107965372, Mir9882, Mir9883, LOC107965709, Mir375, ERCC-00116, ERCC-00117, ERCC-00112, ERCC-00113, LOC102654954, LOC107964557, Mir87-1, LOC102655803, Mir3716a, Mir2-3, Mir2-2, Mir2-1, LOC102656330, LOC107965917, LOC102655163, LOC100577170, LOC102655169, LOC102655344, LOC107964342, LOC102654349, LOC107965763, LOC107965762, LOC107965761, LOC107965870, LOC107965874, ERCC-00170, ERCC-00171, LOC107964578, Mir6001, LOC727564, Mir3723, LOC102653756, LOC107965937, LOC107965935, Mir3720, LOC107965877, LOC102656715, LOC102654433, LOC102655625, Mir3728, LOC102653999, LOC551886, LOC107965103, LOC107965101, LOC107964511, ERCC-00158, ERCC-00150, ERCC-00154, ERCC-00156, ERCC-00157, LOC727231, LOC102654028, LOC107965916, LOC107965482, LOC727325, LOC102653777, Mir2788, LOC726985, LOC100576525, LOC102655386, LOC102655387, LOC102655385, Mir3747b, Mir3747a, LOC107964303, LOC107965900, LOC102654386, LOC102654383, LOC102654382, LOC102654474, Mir190, Mir193, Mir996, LOC107964535, LOC107965462, LOC102653711, Mir3717, LOC102656289, LOC100578431, Mir13b, LOC102656754, Mir13a, LOC107964792, LOC107964498, LOC107964497, LOC726532, LOC102655972, LOC102655504, LOC102655501, Mir316, Mir315, Mir318, Mir9876, Mir31a, Mir133, LOC107965693, LOC727147, LOC102655884, LOC102655887, Mir3749, LOC102655266, LOC107965505, LOC107965996, LOC107965997, Mir2b, LOC100578988, LOC102656771, LOC102654619, Mir1-2, LOC102655695, LOC100576297, LOC727394, Mir3734, Mir3735, Mir3736, Mir3730, Mir3731, Mir3733, Mir3738, Mir3739, Mir92b-2, Mir9868, Mir9869, LOC107964904, LOC102655062, Mir3801, LOC107965835, LOC102654558, LOC107965524, LOC107965527, Mir263b, LOC102655471, LOC102655472, Mir1000, LOC100576910, LOC102656578, LOC102656794, LOC102654293, LOC102654941, LOC100577088, LOC102656895, LOC102655871, Mir252b, Mir137, Mir6047b, LOC102655221, LOC102655951, LOC107965090, LOC107965092, LOC102654574, LOC102654575, LOC107965543, LOC107965546, LOC107965544, LOC107964153, Mir92c, LOC107966019, LOC107966015, LOC107966016, LOC107966010, LOC107966012, LOC107966013, LOC100576939, Mir3761, LOC102656226, LOC102656227, LOC724401, LOC107964717, LOC107964712, LOC102654962, LOC100577416, LOC726911, LOC102654219, LOC107964948, LOC107964943, LOC102653653, LOC727635, LOC100576769, LOC102655021, LOC102656056, LOC727541, LOC102654513, Mir6064, LOC107966039, LOC107966032, LOC107966030, LOC107966031, LOC107966036, Mir2765, LOC725843, Mir3718c, Mir3718b, Apamin, LOC107964770, LOC107965824, LOC107965820, LOC107964414, Mir3752, Mir3753, Mir3750, Mir3751, Mir3754, Mir3755, Mir3759, LOC102654988, LOC726687, LOC102654181, LOC727377, LOC107964966, LOC107965617, Mir6054, Mir6051, LOC102655006, ERCC-00002, ERCC-00003, ERCC-00004, LOC102654537, LOC107965586, LOC107965585, LOC107964199, LOC726769, Mir9875, Mir9874, Mir9877, Mir9871, Mir9870, Mir9873, Mir9879, LOC107964752, LOC107964756, LOC107965808, LOC107965804, LOC102656480, LOC551141, LOC102656878, LOC102656877, bantam, Mir278, LOC107965246, LOC107964034, LOC102653618, Mir750, Mir6038, Mir6039, LOC100578372, Mir9895, ERCC-00028, ERCC-00024, ERCC-00025, ERCC-00022, LOC102654885, LOC107965034, LOC107964200, LOC102655931, Mir9896, LOC107966070, Mir3779, LOC102656700, Mir3796, Mir3797, Mir3792, Mir3790, Mir3791, LOC726739, LOC727600, Mir3768, LOC727178, Mir6012, LOC102656654, LOC107964658, LOC107964650, ERCC-00046, ERCC-00044, ERCC-00042, ERCC-00043, ERCC-00040, ERCC-00041, ERCC-00048, LOC107966006, LOC102654718, LOC102655732, Mir6037, Mir29b, LOC107966097, LOC107966093, LOC107965848, LOC107965847, LOC107964275, Mir71, LOC102655873, LOC102656837, Mir3800, LOC107965202, LOC107965207, LOC107965671, LOC107965672, LOC100578553, Mir279a, LOC107964678, Mir6056, ERCC-00069, ERCC-00060, ERCC-00061, ERCC-00062, ERCC-00067, Mir3478, Mir3477, LOC102654101, LOC102654590, LOC552388, Mir3757, Mir275, Mir277, Mir276, LOC102654338, LOC107964807, LOC107964803, Mir6058, LOC551816, LOC102655853, LOC102655850, LOC102655851, LOC107965229, LOC107965223, LOC102653843, Mir12, Mir10, LOC102656345, Mir3794, Mir3793, ERCC-00083, ERCC-00081, ERCC-00086, ERCC-00084, ERCC-00085, LOC102655192, LOC102655191, Mir3744, Mir3742, LOC102654310, LOC102654311, Mir929, Mir928, LOC107965880, LOC107965886, LOC107965887, LOC102656734, LOC100576601, Obp7, LOC102656405, LOC102656406, LOC102655833, Mir3778, LOC102655954, LOC102653861, LOC102653860, LOC102653865, Mir3770, Mir3772, LOC100577893, Mir3774, Mir3775, LOC107964634, LOC107964635, LOC107965949, LOC102656364, LOC107965944, LOC102656361, LOC102656363, LOC107965940, Mir3777, LOC102654658, Mir8, LOC102654801, Mir7, LOC102655314, Mir6040, Mir6043, Mir6042, Mir6045, Mir6046, LOC107964848, ERCC-00120, ERCC-00123, ERCC-00126, LOC102655817, LOC102654148, LOC102653885, LOC102653888, Mir3773, Mir3719, LOC102655193, Mir3718a, LOC107965969, LOC102656304, Mir993, Mir3795, LOC102655151, Mir281, Mir3776, Mir9b, Mir9a, LOC102656935, LOC107965366, LOC107965365, LOC102654423, LOC107965775, Mir965, LOC102656447, Mir87-2, Miriab-4, ERCC-00104, ERCC-00109, ERCC-00108, LOC107965172, LOC107964568, Mir317, Mir6002, LOC727577, LOC102656321, LOC102656320, LOC107965901, LOC107965907, LOC107965908, Mir9887, LOC102656588, LOC100576577, Mir3756, Mir3758, Mir9880, LOC102653988, LOC102653981, LOC107964354, LOC107964359, Mir9881, LOC102654406, LOC102654407, Mir263a, LOC102656460, ERCC-00165, ERCC-00164, ERCC-00160, ERCC-00163, ERCC-00162, ERCC-00168, Mir6047a, LOC107965903, Mir92a, LOC102655729, LOC107965905, LOC107965904, LOC100578682, LOC102653766, LOC107965920, LOC107965923, LOC107965925, LOC107965924, LOC107965927, Mir279c, Mir279b, Mir279d, LOC107966024, LOC107966028, LOC100577103, LOC102653968, LOC107965326, LOC107964330, LOC102654444, LOC107965971, LOC100578766, LOC550928, ERCC-00148, ERCC-00147, ERCC-00145, ERCC-00144, ERCC-00143, ERCC-00142, LOC102655701, Mir6055, Mir6052, Mir6053, Mir6050, Mir6059, LOC107965966, LOC725561, LOC102655133, LOC102655132, LOC102653943, LOC107964313, Mir184, LOC725235, Mir3715, Mir985, Mir981, Mir980, Mir989, ERCC-00009, Mir927b, Mir3049, LOC102653725, LOC727484, Mir2796, Mir14, Mir11, LOC107965945, LOC107965943, LOC107965942, LOC107965941, Mir3727, Mir3724, LOC102655517, Mir9893, Mir305, Mir307, Mir306, Mir9892, Mir9891, Mir9890, LOC100578856, LOC102656193, LOC727368, Mir3771, LOC727410, Mir1175, LOC102654035, LOC102654034, LOC107965989, LOC107965983, LOC107965982, LOC107965980, LOC107965986, LOC107965948, LOC102654670, Mir34, Mir33, LOC102656503, Mir6057, LOC102655533, LOC102656889, ERCC-00138, Mir9872, LOC100577427, ERCC-00130, Mir210, LOC726926, Mir9878, Mir2944, LOC100576456, LOC100576455, LOC102655949, LOC102654019, LOC102654547, LOC107964141, LOC107964142, LOC107965432, LOC107965437, LOC102656561, LOC102656781, Mcdp, LOC107964706, Mir1006, Mir3763, Mir3762, Mir3760, Mir3767, Mir3766, Mir3765, Mir3764, LOC102655558, LOC727587, Mir125, Mir124, LOC102654203, Mir6063, Mir6062, Mir6061, Mir6060, LOC102655036, LOC100577391, LOC102655962, LOC102655963, LOC107964162, LOC100577022, LOC107966009, LOC107966008, LOC107966007, LOC107966005, LOC107966004, LOC107966001, LOC102656525, Mir79, LOC102656545, LOC412188, LOC100578488, Mir3716b, LOC107964768, LOC107964429, LOC107965386, Mir3746, Mir3741, Mir3743, LOC102655579, LOC102655571, LOC102655570, LOC102655573, LOC102654997, Mir6000b, Mir6000a, Mir100, Mir6049, Mir6048, LOC107965604, LOC102656687, LOC102656680, LOC727315, LOC107966027, LOC107966020, LOC107966023, LOC107966022, LOC107965973, LOC107965970, LOC678674, LOC102656219, LOC107965638, LOC102655599, LOC102656867, LOC107964973, LOC107964974, LOC102653623, LOC107965629, Mir3726, Mir3725, Mir3722, ERCC-00019, Mir3721, ERCC-00013, ERCC-00012, ERCC-00014, ERCC-00017, ERCC-00016, Mir3729, LOC102654744, LOC102654742, LOC102655293, LOC102654529, Mir9866, Mir9867, Mir9864, Mir9865, LOC107966046, Mir219, Mir6041, LOC102656273
Factor <- read.csv("factors.csv")
##Data description file
rownames(Factor) = Factor$sample
Factor$sample = NULL
#Check if samples from tpm file align with "Factor" file
table(rownames(Factor)==rownames(tpm))
##
## TRUE
## 36
#Estimating network connectivity
A = adjacency(t(tpm),type="signed")
k = as.numeric(apply(A,2,sum))-1 # standardized connectivity
Z.k = scale(k)
thresholdZ.k = -2.5
outlierColor = ifelse(Z.k<thresholdZ.k,"red","black")
sampleTree = flashClust(as.dist(1-A), method = "average")
#Convert traits as colours, high values will be indicated in red
traitColors = data.frame(numbers2colors(Factor$age,signed=FALSE))
datColors = data.frame(outlier = outlierColor,traitColors)
plotDendroAndColors(sampleTree,groupLabels=names(datColors),
colors=datColors,main="Age Dendrogram and Trait Heatmap")
Age 2 outlier was identified
# Remove outlying samples
#remove.samples = Z.k<thresholdZ.k | is.na(Z.k)
#tpmOut = tpm[!remove.samples,]
#FactorOut = Facrors[!remove.samples,]
#save(tpmOut, FactorOut, file="SamplesAndTraits_OutliersRemoved.RData")
Choose a soft threshold power
#Set a set of soft-thresholding power
powers = c(c(1:10), seq(from =10, to=30, by=1))
#Call network topology analysis function
sft = pickSoftThreshold(tpm, powerVector=powers, verbose =5, networkType="signed")
## pickSoftThreshold: will use block size 3419.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 3419 of 13084
## Warning: executing %dopar% sequentially: no parallel backend registered
## ..working on genes 3420 through 6838 of 13084
## ..working on genes 6839 through 10257 of 13084
## ..working on genes 10258 through 13084 of 13084
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.9950 4.010 0.994 7700.0 8010.0 9240
## 2 2 0.9100 1.070 0.910 4960.0 5190.0 7130
## 3 3 0.0756 0.097 -0.185 3420.0 3520.0 5770
## 4 4 0.3660 -0.292 0.188 2480.0 2470.0 4820
## 5 5 0.5660 -0.464 0.442 1870.0 1780.0 4120
## 6 6 0.6400 -0.569 0.538 1460.0 1310.0 3580
## 7 7 0.6740 -0.636 0.582 1170.0 981.0 3150
## 8 8 0.7090 -0.682 0.632 957.0 745.0 2810
## 9 9 0.7410 -0.715 0.679 797.0 573.0 2520
## 10 10 0.7440 -0.754 0.696 674.0 446.0 2290
## 11 10 0.7440 -0.754 0.696 674.0 446.0 2290
## 12 11 0.7520 -0.786 0.716 576.0 349.0 2090
## 13 12 0.7420 -0.819 0.721 498.0 276.0 1920
## 14 13 0.7450 -0.846 0.736 435.0 221.0 1770
## 15 14 0.7470 -0.871 0.751 382.0 177.0 1640
## 16 15 0.7580 -0.894 0.775 338.0 143.0 1520
## 17 16 0.7540 -0.917 0.783 301.0 116.0 1420
## 18 17 0.7540 -0.938 0.792 269.0 94.7 1330
## 19 18 0.7640 -0.948 0.814 242.0 78.1 1240
## 20 19 0.7580 -0.970 0.814 218.0 64.2 1170
## 21 20 0.7690 -0.982 0.833 197.0 53.6 1100
## 22 21 0.7800 -0.993 0.850 179.0 45.5 1040
## 23 22 0.7850 -1.010 0.861 164.0 38.5 979
## 24 23 0.7880 -1.030 0.868 150.0 33.1 926
## 25 24 0.7870 -1.040 0.875 137.0 28.6 878
## 26 25 0.7920 -1.050 0.883 126.0 24.7 833
## 27 26 0.7990 -1.070 0.894 116.0 21.7 791
## 28 27 0.8000 -1.080 0.899 107.0 18.9 753
## 29 28 0.7950 -1.090 0.899 99.2 16.7 717
## 30 29 0.8000 -1.100 0.906 91.9 14.6 684
## 31 30 0.7950 -1.110 0.907 85.4 13.0 652
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab= "Soft Threshold (power)", ylab="Scale Free Topology Model Fit, signed R^2", type= "n", main= paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers, col="red")
abline(h=0.90, col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab= "Soft Threshold (power)", ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, col="red")
Construct a gene co-expression matrix and generate modules
# Building an adjacency "correlation" matrix
enableWGCNAThreads()
## Allowing parallel execution with up to 3 working processes.
softPower = 18
adjacency = adjacency(tpm, power = softPower, type = "signed")
Construct Networks
#Translate the adjacency into topological overlap matrix and calculate the corresponding dissimilarity
TOM = TOMsimilarity(adjacency, TOMType="signed") # Specify network type
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM = 1-TOM
Generate Modules
# Generate a clustered gene tree
geneTree = flashClust(as.dist(dissTOM), method="average")
plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE, hang=0.04)
#This sets the minimum number of genes to cluster into a module
minModuleSize = 30
dynamicMods = cutreeDynamic(dendro= geneTree, distM= dissTOM, deepSplit=2, pamRespectsDendro= FALSE, minClusterSize = minModuleSize)
## ..cutHeight not given, setting it to 0.996 ===> 99% of the (truncated) height range in dendro.
## ..done.
dynamicColors= labels2colors(dynamicMods)
MEList= moduleEigengenes(tpm, colors= dynamicColors,softPower = 18)
MEs= MEList$eigengenes
MEDiss= 1-cor(MEs)
METree= flashClust(as.dist(MEDiss), method= "average")
save(dynamicMods, MEList, MEs, MEDiss, METree, file= "Network_allSamples_signed_RLDfiltered.RData")
#Plots tree showing the eigengenes clusters
plot(METree, main= "Clustering of module eigengenes", xlab= "", sub= "")
#Set a threhold for merging modules.
MEDissThres = 0.0
merge = mergeCloseModules(tpm, dynamicColors, cutHeight= MEDissThres, verbose =3)
## mergeCloseModules: Merging modules whose distance is less than 0
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 45 module eigengenes in given set.
## Calculating new MEs...
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 45 module eigengenes in given set.
mergedColors = merge$colors
mergedMEs = merge$newMEs
mergedColors = merge$colors
mergedMEs = merge$newMEs
#plot dendrogram with module colors below it
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels= FALSE, hang=0.03, addGuide= TRUE, guideHang=0.05)
moduleColors = mergedColors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs
Correlate traits
#Defining the number of genes and samples
nGenes = ncol(tpm)
nSamples = nrow(tpm)
#Recalculate MEs adding the color labels
MEs0 = moduleEigengenes(tpm, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, Factor, use= "p")
## Warning in storage.mode(y) = "double": NAs introduced by coercion
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
# Print the correlation heatmap between modules and traits
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
par(mar= c(6, 8.5, 3, 3))
# Display the corelation values with a heatmap plot
labeledHeatmap(Matrix= moduleTraitCor,
xLabels= names(Factor),
yLabels= names(MEs),
ySymbols= names(MEs),
colorLabels= FALSE,
colors= blueWhiteRed(50),
textMatrix= textMatrix,
setStdMargins= FALSE,
cex.text= 0.5,
zlim= c(-1,1),
main= paste("Module-trait relationships"))
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.13.6 (unknown)
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] kableExtra_0.9.0 flashClust_1.01-2 WGCNA_1.63
## [4] fastcluster_1.1.25 dynamicTreeCut_1.63-1 bindrcpp_0.2.2
## [7] gage_2.22.0 plotly_4.8.0 data.table_1.11.4
## [10] rGTExNetwork_0.0.1 RColorBrewer_1.1-2 preprocessCore_1.34.0
## [13] GSEABase_1.34.1 annotate_1.50.1 XML_3.98-1.12
## [16] gplots_3.0.1 RSQLite_2.1.1.9001 GOstats_2.38.1
## [19] graph_1.50.0 Category_2.38.0 Matrix_1.2-6
## [22] AnnotationDbi_1.34.4 IRanges_2.6.1 S4Vectors_0.10.3
## [25] Biobase_2.32.0 BiocGenerics_0.18.0 edgeR_3.14.0
## [28] limma_3.28.21 magrittr_1.5 statmod_1.4.30
## [31] devtools_1.13.6 forcats_0.3.0 stringr_1.3.1
## [34] dplyr_0.7.6 purrr_0.2.5 readr_1.2.0
## [37] tidyr_0.8.1 tibble_1.4.2 ggplot2_3.0.0
## [40] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 rprojroot_1.3-2 htmlTable_1.12
## [4] XVector_0.12.1 base64enc_0.1-3 rstudioapi_0.7
## [7] bit64_0.9-7 mvtnorm_1.0-5 lubridate_1.7.4
## [10] xml2_1.2.0 codetools_0.2-15 splines_3.3.1
## [13] doParallel_1.0.11 robustbase_0.92-6 impute_1.46.0
## [16] knitr_1.20 Formula_1.2-3 jsonlite_1.5
## [19] broom_0.5.0 cluster_2.0.4 GO.db_3.3.0
## [22] png_0.1-7 pheatmap_1.0.10 rrcov_1.4-3
## [25] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
## [28] lazyeval_0.2.1 cli_1.0.0 acepack_1.4.1
## [31] htmltools_0.3.6 tools_3.3.1 gtable_0.2.0
## [34] glue_1.3.0 reshape2_1.4.3 Rcpp_0.12.18
## [37] cellranger_1.1.0 Biostrings_2.40.2 gdata_2.18.0
## [40] nlme_3.1-128 iterators_1.0.10 rvest_0.3.2
## [43] gtools_3.8.1 DEoptimR_1.0-8 zlibbioc_1.18.0
## [46] MASS_7.3-50 scales_0.5.0 hms_0.4.2.9001
## [49] RBGL_1.48.1 yaml_2.2.0 memoise_1.1.0
## [52] gridExtra_2.3 rpart_4.1-13 latticeExtra_0.6-28
## [55] stringi_1.2.4 genefilter_1.54.2 pcaPP_1.9-72.1
## [58] foreach_1.4.4 checkmate_1.8.5 caTools_1.17.1.1
## [61] rlang_0.2.1 pkgconfig_2.0.1 bitops_1.0-6
## [64] matrixStats_0.54.0 evaluate_0.11 lattice_0.20-35
## [67] bindr_0.1.1 labeling_0.3 htmlwidgets_1.2
## [70] robust_0.4-18 bit_1.1-14 tidyselect_0.2.4
## [73] AnnotationForge_1.14.2 plyr_1.8.4 R6_2.2.2
## [76] fit.models_0.5-14 Hmisc_4.1-1 DBI_1.0.0
## [79] pillar_1.3.0 haven_1.1.2 foreign_0.8-71
## [82] withr_2.1.2 KEGGREST_1.12.3 survival_2.42-6
## [85] RCurl_1.95-4.11 nnet_7.3-12 modelr_0.1.2
## [88] crayon_1.3.4 KernSmooth_2.23-15 rmarkdown_1.10
## [91] grid_3.3.1 readxl_1.1.0 blob_1.1.1
## [94] digest_0.6.15 xtable_1.8-2 munsell_0.5.0
## [97] viridisLite_0.3.0